from nucular import Nucular
import os
import shutil
import re
from lxml import html

ROOT_DIR = 'c:/users/vaidhy/Development/scipy/HP_MoR'
SIMPLE_ARCHIVE = 'c:/users/vaidhy/Development/scipy/archive/simple'
titlep = re.compile(r'Chapter\s+(\d+): (.*?) a Harry')
archive = None

def makeArchive():
    """Create a brand-new Nucular archive at SIMPLE_ARCHIVE.

    Whatever directory tree already exists at that path is deleted first
    (deletion errors are ignored), then a Nucular object for the path is
    bound to the module-level ``archive`` and its ``create()`` is called.
    """
    global archive
    # Wipe any previous archive so indexing starts from an empty directory.
    shutil.rmtree(SIMPLE_ARCHIVE, ignore_errors=True)
    archive = Nucular.Nucular(SIMPLE_ARCHIVE)
    archive.create()
    
def addToArchive(file):
    """Parse one chapter HTML page and index it in the module-level archive.

    The page's ``<title>`` text is searched with ``titlep`` to pull out the
    chapter number and chapter title, and the text content of the first
    ``<div class="storytext">`` is used as the indexed content. The entry
    is indexed under the full title text as its identifier.

    file -- the page to parse; passed straight to ``lxml.html.parse``.

    Raises ValueError when the page has no title text, when the title does
    not match ``titlep``, or when no ``storytext`` div is present, so a
    stray file in the input directory is reported by name instead of
    failing with an opaque IndexError/AttributeError.
    """
    tree = html.parse(file)

    # Description field is currently disabled:
    #field_dict['desc'] = tree.xpath('/html/head/meta[@name="description"]')[0].attrib['content']
    title_nodes = tree.xpath('/html/head/title')
    titletext = title_nodes[0].text if title_nodes else None
    if not titletext:
        raise ValueError('%s: no <title> text found' % (file,))

    match = titlep.search(titletext)
    if match is None:
        raise ValueError('%s: unrecognised title %r' % (file, titletext))

    story_nodes = tree.xpath('//div[@class="storytext"]')
    if not story_nodes:
        raise ValueError('%s: no div with class "storytext" found' % (file,))

    field_dict = {}
    field_dict['chapter'] = match.group(1)
    # The captured title may carry the comma that preceded " a Harry".
    field_dict['title'] = match.group(2).strip(',')
    field_dict['content'] = story_nodes[0].text_content()
    archive.indexDictionary(titletext, field_dict)
    
def finalizeArchive():
    """Flush the indexed entries by calling ``archive.store(lazy=False)``.

    Relies on the module-level ``archive`` having been set beforehand
    (makeArchive does this).
    """
    archive.store(lazy=False)

def testArchive(w):
    """Query the archive for the word *w* and print the matching entries.

    If no archive object has been set up yet in this process, one is
    opened on SIMPLE_ARCHIVE first. A query with an ``anyWord(w)``
    criterion is built and its result dictionaries are handed to
    printResults.

    w -- the word to search for.
    """
    global archive
    # Identity test: None is a singleton, and ``==`` could be intercepted
    # by a custom __eq__ on the archive object.
    if archive is None:
        archive = Nucular.Nucular(SIMPLE_ARCHIVE)
    q = archive.Query()
    q.anyWord(w)
    results = q.resultDictionaries()
    printResults(results)

def printResults(rd):
    """Print the title and chapter of every result dictionary in *rd*.

    rd -- iterable of mappings that each have string values under the
          'title' and 'chapter' keys.

    Each ``print`` is given one parenthesised argument, which produces the
    same output under the Python 2 print statement and the Python 3 print
    function.
    """
    for d in rd:
        print("Title :" + d['title'])
        print("Chapter : " + d['chapter'])
    
    
if __name__ == '__main__':
    # Rebuild the index from every entry in ROOT_DIR, then run a sample
    # query against the result.
    makeArchive()
    for entry in os.listdir(ROOT_DIR):
        addToArchive(os.path.join(ROOT_DIR, entry))
    finalizeArchive()
    testArchive('parseltongue')
    

